#Time series project on big data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#loading few particular csv files from the dataset
path = r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr'
company_list = ['AAPL_data.csv', 'GOOG_data.csv', 'MSFT_data.csv', 'AMZN_data.csv']
# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on each iteration (quadratic in rows).
all_data = pd.concat([pd.read_csv(path + '/' + file) for file in company_list])
all_data.shape
(4752, 7)
# Sanity-check the combined frame: four tickers stacked, OHLCV + Name columns.
all_data.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL |
| 1 | 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL |
| 2 | 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL |
| 3 | 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL |
| 4 | 2013-02-14 | 66.3599 | 67.3771 | 66.2885 | 66.6556 | 88809154 | AAPL |
# 'date' comes out of read_csv as a plain string (object dtype).
all_data.dtypes
date object open float64 high float64 low float64 close float64 volume int64 Name object dtype: object
# Convert to datetime64 so dates plot and resample correctly on a time axis.
all_data['date'] = pd.to_datetime(all_data['date'])
all_data.dtypes
date datetime64[ns] open float64 high float64 low float64 close float64 volume int64 Name object dtype: object
#analysing the closing time of the stocks of the major stock holders as taken above.
tech_list = all_data['Name'].unique()
# One subplot per company, two per row; the row count adapts if more
# companies are added (the original hard-coded a 2x2 grid).
n_rows = (len(tech_list) + 1) // 2
plt.figure(figsize=(10,7))
for i,company in enumerate(tech_list,1):
    plt.subplot(n_rows,2,i)
    df = all_data[all_data['Name']==company]
    plt.plot(df['date'],df['close'])
    plt.xticks(rotation='vertical')
    plt.title(company)
plt.tight_layout()  # keeps the rotated tick labels from overlapping neighbours
# Dates were already converted to datetime above, so the x-axis renders cleanly.
# Analyse the total volume of stock traded each day, one interactive
# plotly chart per company (plotly allows zooming into any date window).
import plotly.express as px

for company in tech_list:
    is_company = all_data['Name'] == company
    df = all_data[is_company]
    fig = px.line(df, x='date', y='volume', title=company)
    fig.show()
#now to zoom in or check a particular data, use the plotly function or feature to cut out that part.
# NOTE: `df` still holds the slice for the LAST company from the loop above (AMZN).
df.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 261.40 | 265.25 | 260.555 | 261.95 | 3879078 | AMZN |
| 1 | 2013-02-11 | 263.20 | 263.25 | 256.600 | 257.21 | 3403403 | AMZN |
| 2 | 2013-02-12 | 259.19 | 260.16 | 257.000 | 258.70 | 2938660 | AMZN |
| 3 | 2013-02-13 | 261.53 | 269.96 | 260.300 | 269.47 | 5292996 | AMZN |
| 4 | 2013-02-14 | 267.37 | 270.65 | 265.400 | 269.24 | 3462780 | AMZN |
#analysing daily rise in price in only aapl dataset
# Reuse the dataset directory defined at the top instead of repeating the
# full absolute path (same file, less duplication).
df1 = pd.read_csv(path + '/AAPL_data.csv')
df1.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL |
| 1 | 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL |
| 2 | 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL |
| 3 | 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL |
| 4 | 2013-02-14 | 66.3599 | 67.3771 | 66.2885 | 66.6556 | 88809154 | AAPL |
# Reminder: `df` is still the AMZN slice from the earlier loop — distinct from df1 (AAPL).
df.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 261.40 | 265.25 | 260.555 | 261.95 | 3879078 | AMZN |
| 1 | 2013-02-11 | 263.20 | 263.25 | 256.600 | 257.21 | 3403403 | AMZN |
| 2 | 2013-02-12 | 259.19 | 260.16 | 257.000 | 258.70 | 2938660 | AMZN |
| 3 | 2013-02-13 | 261.53 | 269.96 | 260.300 | 269.47 | 5292996 | AMZN |
| 4 | 2013-02-14 | 267.37 | 270.65 | 265.400 | 269.24 | 3462780 | AMZN |
# Absolute and percentage intraday moves for AAPL.
df1['Daily_price_change'] = df1['close'] - df1['open']
# Reuse the column just computed; numerically identical to recomputing close-open.
df1['1day%return'] = (df1['Daily_price_change'] / df1['open']) * 100
df1.head()
| date | open | high | low | close | volume | Name | Daily_price_change | 1day%return | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL | 0.1400 | 0.206751 |
| 1 | 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL | 0.4900 | 0.719832 |
| 2 | 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL | -1.6586 | -2.421264 |
| 3 | 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL | -0.0286 | -0.042850 |
| 4 | 2013-02-14 | 66.3599 | 67.3771 | 66.2885 | 66.6556 | 88809154 | AAPL | 0.2957 | 0.445600 |
# Bug fix: `company` still holds the last name from the earlier loop (AMZN),
# but this frame is AAPL data — take the title from the frame itself.
fig = px.line(df1,x='date',y='1day%return',title=df1['Name'].iloc[0])
fig.show()
#analysing monthly mean of the close data
# Work on a copy so df1 keeps its plain integer index.
df2=df1.copy()
df2.dtypes
date object open float64 high float64 low float64 close float64 volume int64 Name object Daily_price_change float64 1day%return float64 dtype: object
# df1 was re-read from CSV, so 'date' is a string again — convert before indexing.
df2['date'] = pd.to_datetime(df2['date'])
# A DatetimeIndex enables date-range slicing and resample() below.
df2.set_index('date',inplace=True)
df2.head()
| open | high | low | close | volume | Name | Daily_price_change | 1day%return | |
|---|---|---|---|---|---|---|---|---|
| date | ||||||||
| 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL | 0.1400 | 0.206751 |
| 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL | 0.4900 | 0.719832 |
| 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL | -1.6586 | -2.421264 |
| 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL | -0.0286 | -0.042850 |
| 2013-02-14 | 66.3599 | 67.3771 | 66.2885 | 66.6556 | 88809154 | AAPL | 0.2957 | 0.445600 |
#grabbing data between any two dates
# .loc slicing on a DatetimeIndex is inclusive of both endpoints.
df2.loc['2013-02-08':'2013-02-13']
| open | high | low | close | volume | Name | Daily_price_change | 1day%return | |
|---|---|---|---|---|---|---|---|---|
| date | ||||||||
| 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL | 0.1400 | 0.206751 |
| 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL | 0.4900 | 0.719832 |
| 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL | -1.6586 | -2.421264 |
| 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL | -0.0286 | -0.042850 |
# Resampling by calendar month for the monthly-mean analysis of the close price.
monthly_mean_close = df2['close'].resample('M').mean()
monthly_mean_close.plot(kind='bar')
<AxesSubplot:xlabel='date'>
# Same view aggregated by calendar year.
yearly_mean_close = df2['close'].resample('Y').mean()
yearly_mean_close.plot(kind='bar')
<AxesSubplot:xlabel='date'>
#analysing the correlation of diff companies stock prices
# Reuse the dataset directory defined at the top instead of repeating the full path.
aapl = pd.read_csv(path + '/AAPL_data.csv')
aapl.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 67.7142 | 68.4014 | 66.8928 | 67.8542 | 158168416 | AAPL |
| 1 | 2013-02-11 | 68.0714 | 69.2771 | 67.6071 | 68.5614 | 129029425 | AAPL |
| 2 | 2013-02-12 | 68.5014 | 68.9114 | 66.8205 | 66.8428 | 151829363 | AAPL |
| 3 | 2013-02-13 | 66.7442 | 67.6628 | 66.1742 | 66.7156 | 118721995 | AAPL |
| 4 | 2013-02-14 | 66.3599 | 67.3771 | 66.2885 | 66.6556 | 88809154 | AAPL |
# Same directory reuse as above.
amzn = pd.read_csv(path + '/AMZN_data.csv')
amzn.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 261.40 | 265.25 | 260.555 | 261.95 | 3879078 | AMZN |
| 1 | 2013-02-11 | 263.20 | 263.25 | 256.600 | 257.21 | 3403403 | AMZN |
| 2 | 2013-02-12 | 259.19 | 260.16 | 257.000 | 258.70 | 2938660 | AMZN |
| 3 | 2013-02-13 | 261.53 | 269.96 | 260.300 | 269.47 | 5292996 | AMZN |
| 4 | 2013-02-14 | 267.37 | 270.65 | 265.400 | 269.24 | 3462780 | AMZN |
# Same directory reuse as above.
msft = pd.read_csv(path + '/MSFT_data.csv')
msft.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2013-02-08 | 27.35 | 27.71 | 27.31 | 27.55 | 33318306 | MSFT |
| 1 | 2013-02-11 | 27.65 | 27.92 | 27.50 | 27.86 | 32247549 | MSFT |
| 2 | 2013-02-12 | 27.88 | 28.00 | 27.75 | 27.88 | 35990829 | MSFT |
| 3 | 2013-02-13 | 27.93 | 28.11 | 27.88 | 28.03 | 41715530 | MSFT |
| 4 | 2013-02-14 | 27.92 | 28.06 | 27.87 | 28.04 | 32663174 | MSFT |
# Same directory reuse as above. NOTE: GOOG's history only starts 2014-03-27
# (975 rows vs 1259 for the others) — relevant when aligning frames below.
goog = pd.read_csv(path + '/GOOG_data.csv')
goog.head()
| date | open | high | low | close | volume | Name | |
|---|---|---|---|---|---|---|---|
| 0 | 2014-03-27 | 568.000 | 568.00 | 552.92 | 558.46 | 13052 | GOOG |
| 1 | 2014-03-28 | 561.200 | 566.43 | 558.67 | 559.99 | 41003 | GOOG |
| 2 | 2014-03-31 | 566.890 | 567.00 | 556.93 | 556.97 | 10772 | GOOG |
| 3 | 2014-04-01 | 558.710 | 568.45 | 558.71 | 567.16 | 7932 | GOOG |
| 4 | 2014-04-02 | 565.106 | 604.83 | 562.19 | 567.00 | 146697 | GOOG |
# Closing prices side by side, aligned on the trading DATE rather than on row
# position: GOOG only starts in 2014 (975 rows vs 1259), so positional
# assignment paired prices from different days — e.g. AAPL's 2013-02-08 close
# against GOOG's 2014-03-27 close. Dates with no GOOG price become NaN.
close = pd.DataFrame({
    'aapl': aapl.set_index('date')['close'],
    'amzn': amzn.set_index('date')['close'],
    'msft': msft.set_index('date')['close'],
    'goog': goog.set_index('date')['close'],
})
close.head()
| aapl | amzn | msft | goog | |
|---|---|---|---|---|
| 0 | 67.8542 | 261.95 | 27.55 | 558.46 |
| 1 | 68.5614 | 257.21 | 27.86 | 559.99 |
| 2 | 66.8428 | 258.70 | 27.88 | 556.97 |
| 3 | 66.7156 | 269.47 | 28.03 | 567.16 |
| 4 | 66.6556 | 269.24 | 28.04 | 567.00 |
import seaborn as sns

# Pairwise scatter plots of the four closing-price series.
sns.pairplot(close)
<seaborn.axisgrid.PairGrid at 0x19d10493760>
# Annotated heatmap of the closing-price correlation matrix.
close_corr = close.corr()
sns.heatmap(close_corr, annot=True)
# Conclusion: AMZN and MSFT are the most correlated; GOOG and AAPL the least.
<AxesSubplot:>
#analysing the daily return of stocks and their correlation
data = pd.DataFrame()
data['aapl_change'] = ((aapl['close'] - aapl['open'])/aapl['close'])*100
data['amzn_change'] = ((amzn['close'] - amzn['open'])/amzn['close'])*100
data['msft_change'] = ((msft['close'] - msft['open'])/msft['close'])*100
data['goog_change'] = ((goog['close'] - goog['open'])/goog['close'])*100
data.head()
| aapl_change | amzn_change | msft_change | goog_change | |
|---|---|---|---|---|
| 0 | 0.206325 | 0.209964 | 0.725953 | -1.708269 |
| 1 | 0.714688 | -2.328836 | 0.753769 | -0.216075 |
| 2 | -2.481344 | -0.189409 | 0.000000 | -1.781065 |
| 3 | -0.042869 | 2.946525 | 0.356761 | 1.489879 |
| 4 | 0.443624 | 0.694548 | 0.427960 | 0.334039 |
# Pairwise scatter plots of the daily-change series.
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x19d13155b20>
# Annotated heatmap of the daily-change correlation matrix.
change_corr = data.corr()
sns.heatmap(change_corr, annot=True)
<AxesSubplot:>
#value at risk analysis for different tech companies
# distplot is deprecated (see the FutureWarning below) — histplot with a KDE
# overlay and density normalisation is the documented replacement and draws
# the equivalent picture.
sns.histplot(data['aapl_change'], kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='aapl_change', ylabel='Density'>
#finding standard deviation
# Bug fix: `.std` without parentheses is just the bound method object (the
# cell output below shows the method repr, not a number) — call it.
# One standard deviation covers approx 68% of the data.
data['aapl_change'].std()
<bound method NDFrame._add_numeric_operations.<locals>.std of 0 0.206325
1 0.714688
2 -2.481344
3 -0.042869
4 0.443624
...
1254 0.366551
1255 -3.426791
1256 -1.667838
1257 5.029749
1258 -2.222013
Name: aapl_change, Length: 1259, dtype: float64>
# Two standard deviations cover approx 95% of the data.
2 * data['aapl_change'].std()
2.3742754262842474
data['aapl_change'].std()*3
#three standard deviations cover approx 99.7% of the data (not 98%)
3.561413139426371
# Empirical 10th percentile of the daily change: on 90% of days the intraday
# loss is no worse than this value (a simple historical VaR estimate).
data['aapl_change'].quantile(q=0.1)
-1.4246644227944307
# Summary statistics per company, transposed so each ticker is a row.
data.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| aapl_change | 1259.0 | -0.000215 | 1.187138 | -7.104299 | -0.658021 | 0.042230 | 0.715427 | 8.000388 |
| amzn_change | 1259.0 | -0.000398 | 1.358679 | -9.363077 | -0.738341 | -0.002623 | 0.852568 | 5.640265 |
| msft_change | 1259.0 | 0.076404 | 1.059260 | -5.177618 | -0.509241 | 0.061069 | 0.703264 | 4.861491 |
| goog_change | 975.0 | -0.012495 | 1.092560 | -5.952266 | -0.551963 | 0.024951 | 0.672649 | 4.943550 |